#!/usr/bin/env python
##import modules
import re
import glob

##updating 032821 change _ to | for the reference

#############################################################
##Step 1 construct the consensus location for all the samples
#############################################################
def construct_consensus_loc (input_pop_dir):

    ##updation11.28: initiate a final line to store
    store_final_line_list = []

    ##updation 11.25: initiate a dic to store the number of te_line
    #sample_number = '176' ##this will be generated by later scripts

    ##updation 11.30: generate sample number
    store_sample_dic = {}
    Yen_pop_files = glob.glob(input_pop_dir + '/*.bed')
    for eachfile in Yen_pop_files:
        mt = re.match('.+\/(.+)', eachfile)  ##the name should be considered for other situations.
        file_full_nm = mt.group(1)
        mt = re.match('(.+?)_.+',file_full_nm)
        file_nm = mt.group(1)
        store_sample_dic[file_nm] = 1
    sample_number = str(len(store_sample_dic.keys()))

    ##updation 050220
    ##add a temp dic to store the te_line with the fl_nm
    store_te_line_fl_nm_dic = {}

    store_te_line_number_sample_str_dic = {}

    Yen_pop_files = glob.glob(input_pop_dir + '/*.bed')

    for eachfile in Yen_pop_files:

        mt = re.match('.+\/(.+)', eachfile)  ##the name should be considered for other situations.
        file_full_nm = mt.group(1)
        mt = re.match('(.+?)_.+',file_full_nm)
        file_nm = mt.group(1)

        with open (eachfile,'r') as ipt:
            for line in ipt:
                if not re.match('^track.+', line) and not re.match('^#.+',line):   ##this is the style of the opt from mcclintock
                    line = line.strip('\n')
                    col = line.strip().split()
                    te_nm = ''

                    ##updation 072520 change the insertion locus for the temp tool in mcclintock
                    new_st = col[1]
                    new_ed = col[2]

                    ##updation 042120
                    if 'reference' in col[3]:  ##indicate this format is from mcclintock
                        mt_1 = re.match('^(.+)\|reference.+', col[3])
                        if mt_1:
                            te_nm = mt_1.group(1)
                        mt_2 = re.match('^(.+)\|non-reference.+', col[3])
                        if mt_2:
                            te_nm = mt_2.group(1)

                            ##updation 072520
                            if (int(col[2]) - int(col[1])) > 2:
                                new_st = col[1]
                                new_ed = str(int(col[1]) + 1)

                    else:
                        te_nm = col[3]

                    ##updation3.18: add the direction in formation in the end of the te_line
                    te_line =  col[0] + '\t' + new_st + '\t' + new_ed + '\t' + te_nm + '\t' + col[-1]

                    ##updation 050220: add the file name in the te_line
                    te_line_fl_nm = col[0] + '\t' + new_st + '\t' + new_ed + '\t' + te_nm + '\t' + col[-1] + '\t' + file_nm

                    ##updation11.25: directly store the te_line and calcualte the number of each te_line
                    ##generate a final line like the following mentioned:
                    #final_line = eachline + '\t' + str(y_num) + '\t' + str(n_num) + '\t' + str(perc) + '\t' + sample_string

                    ##updation 050220
                    ##this condition suggests te_line_fl_nm is not detected before either in same sample or not
                    if te_line_fl_nm not in store_te_line_fl_nm_dic:
                        store_te_line_fl_nm_dic[te_line_fl_nm] = 1

                        if te_line in store_te_line_number_sample_str_dic:

                            ##updation 050220
                            ##check if this file_nm occur before
                            ##<2 means 1 that file only appear for one time before
                            #if check_count_sample_dic[file_nm] == 1:
                            ##it does not work since we cannot distinguish this same te_line is from same sample or from different sample

                            store_te_line_number_sample_str_dic[te_line]['in_num'] += 1
                            store_te_line_number_sample_str_dic[te_line]['sp_str'] = store_te_line_number_sample_str_dic[te_line]['sp_str'] + ';' + file_nm

                        else:
                            store_te_line_number_sample_str_dic[te_line] = {'in_num':1,'sp_str':file_nm}
                            #store_te_line_number_sample_str_dic[te_line] = {'sp_str':file_nm}

    ##updation 11.28
    for each_te_line in store_te_line_number_sample_str_dic:
        te_line_col = each_te_line.strip().split()
        #print(each_te_line)
        chr = te_line_col[0]
        st = te_line_col[1]
        ed = te_line_col[2]
        nm = te_line_col[3]
        dr = te_line_col[4]
        in_num = str(store_te_line_number_sample_str_dic[each_te_line]['in_num'])
        no_in_num = str(int(sample_number) - int(in_num))
        perc = ''
        sp_str = store_te_line_number_sample_str_dic[each_te_line]['sp_str']
        if int(in_num) > int(no_in_num):
            perc = str(int(no_in_num)/int(sample_number))
        if int(in_num) < int(no_in_num):
            perc = str(int(in_num)/int(sample_number))
        if int(in_num) == int(no_in_num):
            perc = '0.5'

        final_line = chr + '\t' + st + '\t' + ed + '\t' + nm + '\t' + dr + '\t' + in_num + '\t' + no_in_num + '\t' + perc + '\t' + sp_str
        store_final_line_list.append(final_line)

                    #ref_line_dic[te_line] = 1

    return (store_final_line_list,sample_number)

#########################################
##Step 2 Genotype the TEs for each sample
#########################################
##define a function to merge two dics that will be used for the following scripts
def Merge(dic1,dic2):
    dic2.update(dic1)
    return (dic2)

##define a function to store TE line that will be used for the following scripts
def store_te (ref_file):
    TE_dic = {}
    with open (ref_file,'r') as ipt_ref:
        for line in ipt_ref:
            if not re.match('^track.+',line) and not re.match('^#.+',line):
                col = line.strip().split()
                te_ref = ''

                ##updation 072520 change the insertion locus for the temp tool in mcclintock
                new_st = col[1]
                new_ed = col[2]

                ##updation 042120 consider other format
                if 'reference' in col[3]: ##indicate this format is from mcclintock
                    mt_1 = re.match('^(.+)\|reference.+',col[3])
                    if mt_1:
                        te_ref = mt_1.group(1)
                    mt_2 = re.match('^(.+)\|non-reference.+',col[3])
                    if mt_2:
                        te_ref = mt_2.group(1)

                        ##updation 072520
                        if (int(col[2]) - int(col[1])) > 2:
                            new_st = col[1]
                            new_ed = str(int(col[1]) + 1)

                else:##indicate this format is modified from other tools
                    ##other format: col[3] is the TE name
                    te_ref = col[3]

                ##updation3.18: add the direction information in the end of the te_line
                te_line = col[0] + '\t' + new_st + '\t' + new_ed + '\t' + te_ref + '\t' + col[-1]

                TE_dic[te_line] = 1
    return(TE_dic)

def genotype_TEs (input_pop_dir,ref_line_file):

    ##updation 11.21
    ref_line_dic = {}
    ##updation 11.21 read ref_line_file to ref_line_dic
    with open (ref_line_file,'r') as ipt:
        for eachline in ipt:
            eachline = eachline.strip('\n')
            ref_line_dic[eachline] = 1

    ##initiate a dic to store all the information
    ##the key is the name of the file and the value is the dic storing genotype_line information
    all_geno_fl_dic = {}

    ##get the list which contains different paired lists of results
    pop_fst = glob.glob(input_pop_dir + '/*.bed')
    ##get the all name of sample files
    pop_nm_dic = {}
    for eachfile in pop_fst:
        mt = re.match('.+\/(.+?)_.+', eachfile)
        ##the name should be considered for other situations.
        ##other tools need to change the format to be consistent with name_XXXX.bed XXX could be method name from different tools
        file_nm = mt.group(1)
        pop_nm_dic[file_nm] = 1

    ##conduct analysis for each name in the pop file
    for eachnm in pop_nm_dic:
        print ('genotype_TEs: the analyzed pop name is ' + eachnm)
        te_pop_dic = {}
        for eachfile in pop_fst:
            if eachnm in eachfile:
                te_dic = store_te(eachfile)
                te_pop_dic = Merge(te_dic, te_pop_dic)

        ##initiate a dic to store all the information for each sample
        te_genotype_dic = {}
        ##identification of TEs in the consensus file or not and mark the TEs with 'y' or 'n'
        for eachline in ref_line_dic:
            if eachline in te_pop_dic.keys():
                ##the genotype line stores information about the line and markers with 'y' or 'n'
                genotype_line = eachline + '\t' + 'y'
                te_genotype_dic[genotype_line] = 1
            else:
                genotype_line = eachline + '\t' + 'n'
                te_genotype_dic[genotype_line] = 1

        all_geno_fl_dic[eachnm] = te_genotype_dic

    return (all_geno_fl_dic)


##########################################
##Step 3 Filter TEs that are below the MAF
##########################################
##specify the value to filter TE
##do not set the filter_value
##only filter all the reads showing the same variance

##updation 11.21
def filter_MAF (all_geno_fl_dic,ref_line_file):

    ref_line_dic = {}
    ##updation 11.21 read ref_line_file to ref_line_dic
    with open (ref_line_file,'r') as ipt:
        for eachline in ipt:
            eachline = eachline.strip('\n')
            ref_line_dic[eachline] = 1


    ##initate a dic to store the genotyped TEs with the filtration
    final_line_dic = {}

    ##get the total number of samples
    total_num = len(list(all_geno_fl_dic.keys()))

    cal_geno_y_dic = {}
    cal_geno_n_dic = {}

    ##updation 8.11 create a dic to store the sample information
    store_sample_dic = {}
    ##analyze each file
    for eachnm in all_geno_fl_dic:
        ##analyze each line for eachfile
        for each_geno_line in all_geno_fl_dic[eachnm]:

            each_geno_line = each_geno_line.strip('\n')
            col = each_geno_line.strip().split()

            ##the marker means the 'y' or 'n'
            marker = col[-1]
            loc_line = col[0] + '\t' + col[1] + '\t' + col[2] + '\t' + col[3] + '\t' + col[4]


            ##updation 8.11 add the name specific for each loc with y
            ##store the y line into cal_geno_y_dic
            if marker == 'y':
                if loc_line in cal_geno_y_dic:
                    cal_geno_y_dic[loc_line]['y'] += 1
                    #cal_geno_y_dic[loc_line]['sample'] = cal_geno_y_dic[loc_line]['sample'] + ';' +eachnm
                    store_sample_dic[loc_line]['sample_str'] = store_sample_dic[loc_line]['sample_str'] + ';' + eachnm
                else:
                    cal_geno_y_dic[loc_line] = {'y':1}
                    ##add sample information
                    store_sample_dic[loc_line] = {'sample_str':eachnm}

            ##store the n line into cal_geno_n_dic
            if marker == 'n':
                if loc_line in cal_geno_n_dic:
                    cal_geno_n_dic[loc_line]['n'] += 1
                else:
                    cal_geno_n_dic[loc_line] = {'n':1}

    ##filter the TE loc
    ##if y > n, n_num/all_sample > 0.1, which will be kept
    ##if y < n, y_num/all_sample > 0.1, which will be kept
    count = 0
    for eachline in ref_line_dic:
        count += 1
        print ('filter_MAF: the analyzed each te line is ' + str(count))
        ##if the loc is in cal_geno_y_dic and cal_geno_n_dic,
        ##if the loc only exit in single dic, which means the MAF = 0, this will be filtered out
        if eachline in cal_geno_y_dic.keys() and eachline in cal_geno_n_dic.keys():

            y_num = cal_geno_y_dic[eachline]['y']
            n_num = cal_geno_n_dic[eachline]['n']

            ##updation8.11 add sample information
            sample_string = store_sample_dic[eachline]['sample_str']

            if y_num > n_num:
                perc = n_num/total_num
                #if float(perc) > float(input_sample_thr):
                    ##this loc will be kept:
                final_line = eachline + '\t' + str(y_num) + '\t' + str(n_num) + '\t' + str(perc) + '\t' + sample_string
                final_line_dic[final_line] = 1

            if y_num < n_num:
                perc = y_num/total_num
                #if float(perc) > float(input_sample_thr):
                    ##this loc will be kept:
                final_line = eachline + '\t' + str(y_num) + '\t' + str(n_num) + '\t' + str(perc) + '\t' + sample_string
                final_line_dic[final_line] = 1

            if y_num == n_num:
                final_line = eachline + '\t' + str(y_num) + '\t' + str(n_num) + '\t' + '0.5' + '\t' + sample_string
                final_line_dic[final_line] = 1

    return (final_line_dic)
































